Code adapted from WGCNA tutorials

(https://labs.genetics.ucla.edu/horvath/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html)

We begin by loading in the necessary packages and data.

library(WGCNA)
## ==========================================================================
## *
## *  Package WGCNA 1.51 loaded.
## *
## *    Important note: It appears that your system supports multi-threading,
## *    but it is not enabled within WGCNA in R. 
## *    To allow multi-threading within WGCNA with all available cores, use 
## *
## *          allowWGCNAThreads()
## *
## *    within R. Use disableWGCNAThreads() to disable threading if necessary.
## *    Alternatively, set the following environment variable on your system:
## *
## *          ALLOW_WGCNA_THREADS=<number_of_processors>
## *
## *    for example 
## *
## *          ALLOW_WGCNA_THREADS=4
## *
## *    To set the environment variable in linux bash shell, type 
## *
## *           export ALLOW_WGCNA_THREADS=4
## *
## *     before running R. Other operating systems or shells will
## *     have a similar command to achieve the same aim.
## *
## ==========================================================================
library(dplyr)
library(tidyr)
library(DESeq2)
library(Rtsne)
library(ggplot2)
library(topGO)
library(sva)
library(biomaRt)

load("~/Documents/git/unified_gene_expression/data/counts_processed.Rdata")
load("~/Documents/git/unified_gene_expression/data/core_metaData.Rdata")

expr = t(counts)
meta = core_tight

dim(expr)
## [1]   841 20330
dim(meta)
## [1] 9317    9

We left_join the meta sheet (for the sample_accession variable) and the expression matrix. We then subset the resulting matrix into RPE, Retina, and RPE+Retina tables.

sample_accession = rownames(expr)
expr.temp = as.data.frame(cbind(sample_accession, expr))

rownames(expr.temp)[1:5]
## [1] "E-MTAB-4377.RNA10" "E-MTAB-4377.RNA11" "E-MTAB-4377.RNA12"
## [4] "E-MTAB-4377.RNA13" "E-MTAB-4377.RNA14"
colnames(expr.temp)[1:5]
## [1] "sample_accession" "A1BG"             "A1CF"            
## [4] "A2M"              "A2ML1"
expr.meta = left_join(expr.temp, meta, by = "sample_accession") %>% dplyr::select(-run_accession) %>% distinct()
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
expr.temp$sample_accession = as.character(expr.temp$sample_accession)

expr.temp$sample_accession[1:5]
## [1] "E-MTAB-4377.RNA10" "E-MTAB-4377.RNA11" "E-MTAB-4377.RNA12"
## [4] "E-MTAB-4377.RNA13" "E-MTAB-4377.RNA14"
expr.meta$sample_accession[1:5]
## [1] "E-MTAB-4377.RNA10" "E-MTAB-4377.RNA11" "E-MTAB-4377.RNA12"
## [4] "E-MTAB-4377.RNA13" "E-MTAB-4377.RNA14"
identical(expr.temp$sample_accession, expr.meta$sample_accession)
## [1] TRUE
rownames(expr.meta) = expr.meta$sample_accession

expr.meta[1:5, 1:3]
##                    sample_accession     A1BG    A1CF
## E-MTAB-4377.RNA10 E-MTAB-4377.RNA10        9 6.00811
## E-MTAB-4377.RNA11 E-MTAB-4377.RNA11       25 14.0293
## E-MTAB-4377.RNA12 E-MTAB-4377.RNA12 22.99999 11.0124
## E-MTAB-4377.RNA13 E-MTAB-4377.RNA13       27 4.00845
## E-MTAB-4377.RNA14 E-MTAB-4377.RNA14       28 1.00068
class(expr.meta)
## [1] "data.frame"
table(expr.meta$Tissue)
## 
##   Adipose Tissue     Adrenal Gland             Blood      Blood Vessel  
##                20                10                20                30 
##            Brain            Breast             Colon         Esophagus  
##               130                10                20                30 
##            Heart             Liver              Lung            Muscle  
##                20                10                10                10 
##            Nerve          Pancreas         Pituitary    Salivary Gland  
##                10                10                10                10 
##             Skin   Small Intestine            Spleen           Stomach  
##                31                10                10                10 
##          Thyroid             Cornea  ENCODE Cell Line            Retina 
##                10                28               112               107 
##               RPE 
##                76
expr.retina = filter(expr.meta, Tissue == "Retina")
expr.rpe = filter(expr.meta, Tissue == "RPE")
expr.retina.rpe = filter(expr.meta, Tissue == "Retina" | Tissue == "RPE")

rownames(expr.retina) = expr.retina$sample_accession
rownames(expr.rpe) = expr.rpe$sample_accession
rownames(expr.retina.rpe) = expr.retina.rpe$sample_accession

expr.retina = expr.retina[, c(2:ncol(expr.retina), 1)]
expr.rpe = expr.rpe[, c(2:ncol(expr.rpe), 1)]
expr.retina.rpe = expr.retina.rpe[, c(2:ncol(expr.retina.rpe), 1)]

expr.retina[1:5, 1:5]
##                       A1BG    A1CF      A2M     A2ML1 A3GALT2
## E-MTAB-4377.RNA10        9 6.00811  2407.96  22.78949       5
## E-MTAB-4377.RNA11       25 14.0293 5608.974  44.42833       0
## E-MTAB-4377.RNA12 22.99999 11.0124     4419  15.05002       1
## E-MTAB-4377.RNA13       27 4.00845  914.999 11.280146       0
## E-MTAB-4377.RNA14       28 1.00068     1160   6.28242       1
expr.rpe[1:5, 1:5]
##                 A1BG             A1CF       A2M     A2ML1 A3GALT2
## SRS1054265  70.00001          16.0123   3488.61  33.25127       3
## SRS1054266       514  7.0022000134004        53  24.35175       3
## SRS1054267 343.00027 15.0113000190766     53.98  53.17001       3
## SRS1054268 307.00013          13.0007 292.99961  14.85531       0
## SRS1054269 235.00018          9.00208        65 25.132097       3
expr.retina.rpe[1:5, 1:5]
##                       A1BG    A1CF      A2M     A2ML1 A3GALT2
## E-MTAB-4377.RNA10        9 6.00811  2407.96  22.78949       5
## E-MTAB-4377.RNA11       25 14.0293 5608.974  44.42833       0
## E-MTAB-4377.RNA12 22.99999 11.0124     4419  15.05002       1
## E-MTAB-4377.RNA13       27 4.00845  914.999 11.280146       0
## E-MTAB-4377.RNA14       28 1.00068     1160   6.28242       1
dim(expr.retina)
## [1]   107 20338
dim(expr.rpe)
## [1]    76 20338
dim(expr.retina.rpe)
## [1]   183 20338
save(expr.retina, file = "~/Documents/git/unified_gene_expression/data/counts_retina.Rdata")
save(expr.rpe, file = "~/Documents/git/unified_gene_expression/data/counts_rpe.Rdata")
save(expr.retina.rpe, file = "~/Documents/git/unified_gene_expression/data/counts_retina_rpe.Rdata")

We first build the network with just the samples from the retina.

#####################################
##         Retina Network          ##
#####################################

meta.retina = expr.retina[, which(colnames(expr.retina) == "study_accession"):ncol(expr.retina)]
expr.retina = expr.retina[, 1:(which(colnames(expr.retina) == "study_accession") - 1)]

expr.retina[] = sapply(expr.retina, function(f) as.numeric(levels(f))[f])

save(expr.retina, file = "~/Documents/git/unified_gene_expression/data/expr_retina.Rdata")
save(meta.retina, file = "~/Documents/git/unified_gene_expression/data/meta_retina.Rdata")

We engage in some pre-processing of the data. We first filter out any genes for which at least 10% of samples have less than 10 counts. We then apply a variance stabilizing transformation and check to see that all the genes are satisfactory (in terms of variance and low expression).

min.expr.indices = which(colSums(expr.retina >= 10) >= 0.9*nrow(expr.retina))
expr.retina = expr.retina[, min.expr.indices]

expr.retina.var.stab = varianceStabilizingTransformation(round(as.matrix(expr.retina)))
save(expr.retina.var.stab, file = "~/Documents/git/unified_gene_expression/data/counts_retina_varStab.Rdata")

gsg = goodSamplesGenes(expr.retina.var.stab)
##  Flagging genes and samples with too many missing values...
##   ..step 1
gsg$allOK
## [1] TRUE

We perform hierarchical cluster (using average linkage) on the samples to look for any outliers. We notice that the samples are clustering based on which study they came from.

sampleTree = hclust(dist(expr.retina.var.stab), method = "average")

# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(as.numeric(factor(meta.retina$study_accession)), signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree, traitColors,
                    groupLabels = "study_accession",
                    main = "Sample dendrogram and study source (study_accesssion)")

We find in the dendrogram plot that there are batch effects with the different data sources. We further investigate these batch issues using t-SNE (t-Stochastic Neighbor Embedding).

set.seed(47)

perplexities = seq(10, 30, by = 10)

for(p in perplexities){
  
  tsne_out = Rtsne(as.matrix(expr.retina.var.stab), perplexity = p, check_duplicates = F, theta = 0.0)
  tsne_plot = data.frame(tsne_out$Y)
  
  p1 = tsne_plot %>%
    ggplot(aes(x=X1,y=X2,colour=factor(meta.retina$study_accession))) + 
    geom_point(size=4) + ggtitle(paste0("t-sne. Perplexity = ", p))
  print(p1)
  
  p2 = tsne_plot %>%
    ggplot(aes(x=X1,y=X2,colour=factor(meta.retina$Sub_Tissue))) + 
    geom_point(size=4) + ggtitle(paste0("t-sne. Perplexity = ", p))
  print(p2)
}

We then use Combat (from the sva package) to adjust for the batch issues (fetal vs. adult, study source).

expr.retina.combat = as.data.frame(t(expr.retina.var.stab))

# Remove the data from studies that have very few observations
small.study.indices = which(meta.retina$study_accession %in% c("SRP002881", "SRP015336"))
expr.retina.combat = expr.retina.combat[, -(small.study.indices)]
meta.retina.combat = meta.retina[-(small.study.indices), ]

meta.retina.combat.mergeVar = meta.retina.combat %>%
  unite(study_subTissue, study_accession, Sub_Tissue)

modCombat = model.matrix(~1, data = meta.retina.combat.mergeVar)
combat_expr = t(ComBat(dat = expr.retina.combat, batch = factor(meta.retina.combat.mergeVar$study_subTissue), 
                     mod = modCombat, par.prior=TRUE, prior.plots=FALSE))
## Found 4 batches
## Adjusting for 0 covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

Following batch correction with Combat, we see that the batch issues are resolved, as clustering is minimal in t-SNE.

set.seed(47)

perplexities = seq(10, 30, by = 10)

for(p in perplexities){
  
  tsne_out = Rtsne(as.matrix(combat_expr), perplexity = p, check_duplicates = F, theta = 0.0)
  tsne_plot = data.frame(tsne_out$Y)
  
  p1 = tsne_plot %>%
    ggplot(aes(x=X1,y=X2,colour=factor(meta.retina.combat.mergeVar$study_subTissue))) + 
    geom_point(size=4) + ggtitle(paste0("t-sne (post-Combat adjustment). Perplexity = ", p))
  print(p1)
  
  p2 = tsne_plot %>%
    ggplot(aes(x=X1,y=X2,colour=factor(meta.retina.combat.mergeVar$study_subTissue))) + 
    geom_point(size=4) + ggtitle(paste0("t-sne (post-Combat adjustment). Perplexity = ", p))
  print(p2)
}

Following batch correction, we see that the distinct clusters are no longer present. Now we recluster the samples with hierarchical clustering (average linkage). The dendrogram shows that the batch issues have been resolved (at least partially).

sampleTree.combat = hclust(dist(combat_expr), method = "average")

# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors = numbers2colors(as.numeric(factor(meta.retina.combat.mergeVar$study_subTissue)), signed = FALSE);
# Plot the sample dendrogram and the colors underneath.
plotDendroAndColors(sampleTree.combat, traitColors,
                    groupLabels = "study_accession",
                    main = "Sample dendrogram and trait heatmap")

Now that we have resolved the batch effect issue, we can resume building a WGCNA network, exploring both signed and unsigned networks. We first begin with automatic network building.

options(stringsAsFactors = FALSE)

####################################
##           Unsigned             ##
####################################

# Choose a set of soft-thresholding powers
powers.unsigned = c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft.unsigned = pickSoftThreshold(combat_expr, powerVector = powers.unsigned, networkType = "unsigned")
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.7680  2.330          0.859 6020.00  6390.000 8440.0
## 2      2   0.2570  0.551          0.593 3280.00  3440.000 5820.0
## 3      3   0.0426 -0.197          0.479 1980.00  1990.000 4250.0
## 4      4   0.2940 -0.595          0.637 1270.00  1210.000 3230.0
## 5      5   0.4650 -0.842          0.741  860.00   757.000 2520.0
## 6      6   0.5520 -1.050          0.787  601.00   491.000 2010.0
## 7      7   0.6060 -1.200          0.818  433.00   328.000 1640.0
## 8      8   0.6570 -1.290          0.850  319.00   222.000 1350.0
## 9      9   0.6940 -1.370          0.874  240.00   154.000 1120.0
## 10    10   0.7250 -1.420          0.893  183.00   109.000  940.0
## 11    12   0.7760 -1.480          0.929  111.00    57.000  676.0
## 12    14   0.8070 -1.530          0.950   70.90    31.600  502.0
## 13    16   0.8180 -1.590          0.960   46.60    18.200  383.0
## 14    18   0.8240 -1.630          0.967   31.60    10.800  296.0
## 15    20   0.8330 -1.670          0.972   21.90     6.550  232.0
## 16    22   0.8460 -1.680          0.978   15.50     4.110  184.0
## 17    24   0.8570 -1.690          0.982   11.20     2.580  147.0
## 18    26   0.8600 -1.700          0.985    8.17     1.660  118.0
## 19    28   0.8670 -1.710          0.987    6.05     1.100   95.9
## 20    30   0.8770 -1.720          0.992    4.54     0.732   78.7
# With a power of 14 we get
# Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
# 14     0.8070   -1.530        0.950   70.90    31.600  502.0

####################################
##           Signed             ##
####################################

# Choose a set of soft-thresholding powers
powers.signed = c(c(1:10), seq(from = 12, to=30, by=2))
# Call the network topology analysis function
sft.signed = pickSoftThreshold(combat_expr, powerVector = powers.signed, networkType = "signed")
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.8860  9.380          0.938  9800.0   10000.0  11100
## 2      2   0.8060  4.030          0.897  7180.0    7440.0   9080
## 3      3   0.6770  2.130          0.831  5380.0    5610.0   7540
## 4      4   0.4170  1.050          0.692  4120.0    4300.0   6380
## 5      5   0.1660  0.521          0.592  3210.0    3320.0   5450
## 6      6   0.0136  0.133          0.524  2530.0    2600.0   4700
## 7      7   0.0190 -0.157          0.539  2030.0    2040.0   4090
## 8      8   0.1010 -0.360          0.601  1640.0    1620.0   3580
## 9      9   0.2010 -0.519          0.659  1340.0    1300.0   3150
## 10    10   0.3010 -0.652          0.710  1110.0    1040.0   2790
## 11    12   0.4470 -0.893          0.769   773.0     686.0   2230
## 12    14   0.5350 -1.090          0.806   554.0     461.0   1810
## 13    16   0.5960 -1.230          0.832   406.0     317.0   1490
## 14    18   0.6470 -1.320          0.860   304.0     221.0   1240
## 15    20   0.6860 -1.390          0.880   231.0     156.0   1040
## 16    22   0.7270 -1.430          0.905   178.0     112.0    877
## 17    24   0.7550 -1.470          0.921   139.0      81.5    746
## 18    26   0.7690 -1.500          0.929   110.0      60.1    638
## 19    28   0.7950 -1.510          0.944    87.8      44.9    549
## 20    30   0.8030 -1.550          0.950    70.7      34.0    478
# With a power of 30 we get
# Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
# 30    0.8030    -1.550      0.950    70.7      34.0    478

We achieve scale-free topology indices of 0.8 for powers of 14 and 30 for unsigned and signed networks, respectively. We build the networks using these power parameters. Upon examining the resulting gene dendrograms, it’s clear that the unsigned network is superior.

net.unsigned = blockwiseModules(combat_expr, power = 14, maxBlockSize = 14000,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.1,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       verbose = 3)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ..Working on block 1 .
##     TOM calculation: adjacency..
## adjacency: replaceMissing: 0
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication..
##     ..normalization..
##     ..done.
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking modules for statistical meaningfulness..
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.1
##        Calculating new MEs...
net.signed = blockwiseModules(combat_expr, power = 30, maxBlockSize = 14000,
                       TOMType = "signed", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.05,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       verbose = 3)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ..Working on block 1 .
##     TOM calculation: adjacency..
## adjacency: replaceMissing: 0
##     ..will not use multithreading.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication..
##     ..normalization..
##     ..done.
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking modules for statistical meaningfulness..
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.05
##        Calculating new MEs...
save(net.unsigned, file = "~/Documents/git/unified_gene_expression/data/retina_net_unsigned.Rdata")
save(net.signed, file = "~/Documents/git/unified_gene_expression/data/retina_net_signed.Rdata")

# Examining number of modules
table(net.unsigned$colors)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  643 6464 2036 1087  749  449  283  255  254  244  187  161  152  122  106 
##   15   16   17   18   19   20   21   22   23 
##   91   84   79   75   62   61   49   48   33
# Convert labels to colors for plotting
mergedColors = labels2colors(net.unsigned$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net.unsigned$dendrograms[[1]], mergedColors[net.unsigned$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

# Examining number of modules
table(net.signed$colors)
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 6892 3014  764  278  266  206  194  190  189  181  148  141  128  115  115 
##   15   16   17   18   19   20   21   22   23   24   25   26   27 
##  110  109  102   94   80   76   61   58   56   56   56   53   42
# Convert labels to colors for plotting
mergedColors = labels2colors(net.signed$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net.signed$dendrograms[[1]], mergedColors[net.signed$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

Building a network step-by-step

We now consider building a network using the TOM (Topological Overlap Matrix) and hierarchical clustering, as opposed to the automatic network construction performed above.

softPower = 14
adjacency = adjacency(combat_expr, power = softPower)

# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency)
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                deepSplit = 2, pamRespectsDendro = FALSE,
                minClusterSize = minModuleSize);
##  ..cutHeight not given, setting it to 0.999  ===>  99% of the (truncated) height range in dendro.
##  ..done.
table(dynamicMods)
## dynamicMods
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  259 1587  858  802  557  511  446  405  331  331  284  265  221  219  215 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##  214  210  206  195  185  184  176  171  169  166  158  158  156  151  151 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##  151  150  149  148  146  139  137  134  130  129  126  118  117  116  105 
##   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59 
##   99   97   97   97   96   95   92   89   88   87   82   78   76   76   76 
##   60   61   62   63   64   65   66   67   68 
##   67   66   65   62   58   53   50   49   43
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
##   antiquewhite4         bisque4           black            blue 
##              66              97             405             858 
##           brown          brown4          coral1          coral2 
##             802              97              67              65 
##            cyan       darkgreen        darkgrey     darkmagenta 
##             215             171             166             146 
##  darkolivegreen      darkorange     darkorange2         darkred 
##             148             158              99             176 
##   darkseagreen4   darkslateblue   darkturquoise     floralwhite 
##              76              97             169             105 
##           green     greenyellow            grey          grey60 
##             511             265             259             206 
##       honeydew1           ivory  lavenderblush3       lightcyan 
##              76             116              76             210 
##      lightcyan1      lightgreen      lightpink4 lightsteelblue1 
##             117             195              78             118 
##     lightyellow         magenta          maroon    mediumorchid 
##             185             331              82              62 
##   mediumpurple3    midnightblue    navajowhite2          orange 
##             126             214              87             158 
##      orangered3      orangered4   paleturquoise  palevioletred3 
##              43             129             150              88 
##            pink            plum           plum1           plum2 
##             331              49             130              96 
##          purple             red       royalblue     saddlebrown 
##             284             446             184             151 
##          salmon         salmon4         sienna3         skyblue 
##             219              89             139             151 
##        skyblue1        skyblue2        skyblue3       steelblue 
##              50              58             134             151 
##             tan        thistle1        thistle2       turquoise 
##             221              92              95            1587 
##          violet           white          yellow         yellow4 
##             149             156             557              53 
##     yellowgreen 
##             137
# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.0005,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

We then calculate the eigengenes for the different modules.

# Calculate eigengenes
MEList = moduleEigengenes(combat_expr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")

MEDissThres = 0.05   # used to be 0.15
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")

# Call an automatic merging function
merge = mergeCloseModules(combat_expr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
##  mergeCloseModules: Merging modules whose distance is less than 0.05
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 69 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 58 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 56 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 56 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1;
MEs = mergedMEs;
# Save module colors and labels for use in subsequent parts
save(MEs, moduleLabels, moduleColors, geneTree, file = "~/Documents/git/unified_gene_expression/data/networkConstruction-stepByStep.RData")

Building a network step-by-step

We now build a signed network.

powers = c(1:10, seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(combat_expr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 3248.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 3248 of 13774
##    ..working on genes 3249 through 6496 of 13774
##    ..working on genes 6497 through 9744 of 13774
##    ..working on genes 9745 through 12992 of 13774
##    ..working on genes 12993 through 13774 of 13774
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.7680  2.330          0.859  6020.0   6390.00   8440
## 2      2   0.2570  0.551          0.593  3280.0   3440.00   5820
## 3      3   0.0426 -0.197          0.479  1980.0   1990.00   4250
## 4      4   0.2940 -0.595          0.637  1270.0   1210.00   3230
## 5      5   0.4650 -0.842          0.741   860.0    757.00   2520
## 6      6   0.5520 -1.050          0.787   601.0    491.00   2010
## 7      7   0.6060 -1.200          0.818   433.0    328.00   1640
## 8      8   0.6570 -1.290          0.850   319.0    222.00   1350
## 9      9   0.6940 -1.370          0.874   240.0    154.00   1120
## 10    10   0.7250 -1.420          0.893   183.0    109.00    940
## 11    12   0.7760 -1.480          0.929   111.0     57.00    676
## 12    14   0.8070 -1.530          0.950    70.9     31.60    502
## 13    16   0.8180 -1.590          0.960    46.6     18.20    383
## 14    18   0.8240 -1.630          0.967    31.6     10.80    296
## 15    20   0.8330 -1.670          0.972    21.9      6.55    232
{# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.80,col="red")}

# Mean connectivity as a function of the soft-thresholding power
{plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers,col="red")}

softPower = 14
adjacency = adjacency(combat_expr, power = softPower)

# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency)
## ..connectivity..
## ..matrix multiplication..
## ..normalization..
## ..done.
dissTOM = 1-TOM

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average");
# Plot the resulting clustering tree (dendrogram)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 30;
# Module identification using dynamic tree cut:
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                deepSplit = 2, pamRespectsDendro = FALSE,
                minClusterSize = minModuleSize);
##  ..cutHeight not given, setting it to 0.999  ===>  99% of the (truncated) height range in dendro.
##  ..done.
table(dynamicMods)
## dynamicMods
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  259 1587  858  802  557  511  446  405  331  331  284  265  221  219  215 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##  214  210  206  195  185  184  176  171  169  166  158  158  156  151  151 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##  151  150  149  148  146  139  137  134  130  129  126  118  117  116  105 
##   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59 
##   99   97   97   97   96   95   92   89   88   87   82   78   76   76   76 
##   60   61   62   63   64   65   66   67   68 
##   67   66   65   62   58   53   50   49   43
# Convert numeric lables into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
##   antiquewhite4         bisque4           black            blue 
##              66              97             405             858 
##           brown          brown4          coral1          coral2 
##             802              97              67              65 
##            cyan       darkgreen        darkgrey     darkmagenta 
##             215             171             166             146 
##  darkolivegreen      darkorange     darkorange2         darkred 
##             148             158              99             176 
##   darkseagreen4   darkslateblue   darkturquoise     floralwhite 
##              76              97             169             105 
##           green     greenyellow            grey          grey60 
##             511             265             259             206 
##       honeydew1           ivory  lavenderblush3       lightcyan 
##              76             116              76             210 
##      lightcyan1      lightgreen      lightpink4 lightsteelblue1 
##             117             195              78             118 
##     lightyellow         magenta          maroon    mediumorchid 
##             185             331              82              62 
##   mediumpurple3    midnightblue    navajowhite2          orange 
##             126             214              87             158 
##      orangered3      orangered4   paleturquoise  palevioletred3 
##              43             129             150              88 
##            pink            plum           plum1           plum2 
##             331              49             130              96 
##          purple             red       royalblue     saddlebrown 
##             284             446             184             151 
##          salmon         salmon4         sienna3         skyblue 
##             219              89             139             151 
##        skyblue1        skyblue2        skyblue3       steelblue 
##              50              58             134             151 
##             tan        thistle1        thistle2       turquoise 
##             221              92              95            1587 
##          violet           white          yellow         yellow4 
##             149             156             557              53 
##     yellowgreen 
##             137
# Plot the dendrogram and colors underneath
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.0003,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")

We then calculate the eigengenes.

# Calculate eigengenes
MEList = moduleEigengenes(combat_expr, colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs);
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
{plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")

MEDissThres = 0.1   # used to be 0.15
# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")}

# Call an automatic merging function
merge = mergeCloseModules(combat_expr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
##  mergeCloseModules: Merging modules whose distance is less than 0.1
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 69 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 34 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 29 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 27 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 26 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 25 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 25 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
table(mergedColors)
## mergedColors
##         bisque4            cyan     darkmagenta     darkorange2 
##              97             215            5345            2075 
##         darkred   darkseagreen4   darkslateblue   darkturquoise 
##            1801              76             476             265 
##           green            grey       lightcyan      lightcyan1 
##             795             259             210             268 
##      lightpink4 lightsteelblue1          maroon    mediumorchid 
##              78             118             361              62 
##      orangered4          salmon         skyblue        skyblue1 
##             129             219             151              50 
##       steelblue        thistle2           white         yellow4 
##             283              95             156              53 
##     yellowgreen 
##             137
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

Network Visualization

We obtain the TOM and visualize the network using a heat map. Plotting on all genes is very computationally intensive, so here we just compute the TOM on a randomly selected subset of 400 genes.

# Transform dissTOM with a power to make moderately strong connections more visible in the heatmap
plotTOM = dissTOM^7;
# Set diagonal to NA for a nicer plot
diag(plotTOM) = NA;
# Call the plot function

# plotting on 
#TOMplot(plotTOM, geneTree, mergedColors, main = "Network heatmap plot, all genes")


nSelect = 400
# For reproducibility, we set the random seed
set.seed(47);
select = sample(ncol(combat_expr), size = nSelect);
selectTOM = dissTOM[select, select];
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = mergedColors[select];
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

Pathway Analysis (Retina Network)

library(GO.db)
library(AnnotationDbi)
library(org.Hs.eg.db)


# Load annotation file
load("~/Documents/git/unified_gene_expression/data/gencode_v25_annotation.Rdata")
# Read in the probe annotation
annot = gencode_v25_annotation %>% dplyr::select(-Transcript.ID) %>% distinct()
n_distinct(annot$Gene.ID)
## [1] 20378
n_distinct(annot$Gene.Name)
## [1] 20330
# Removing alternative splicing artifacts
annot[, 'Gene.ID'] = gsub("\\.[^.]*$", "", annot$Gene.ID)
annot = annot %>% distinct()
dim(annot)
## [1] 20360     2
# Match probes in the data set to the probe IDs in the annotation file 
probes = colnames(combat_expr)
probes2annot = which(annot$Gene.Name %in% probes)
# Get the corresponding Gene IDs
allGeneIDs = annot[probes2annot, ]
# (For now - need new method) eliminate redundancy for gene names
allGeneIDs = allGeneIDs[!duplicated(allGeneIDs$Gene.Name),]
dim(allGeneIDs)
## [1] 13774     2
mart <- useMart("ensembl")
datasets <- listDatasets(mart)
mart<-useDataset("hsapiens_gene_ensembl",mart)
entrez_ids = getBM(attributes=c("ensembl_gene_id", "entrezgene"),filters="ensembl_gene_id",values=allGeneIDs, mart=mart)

test = merge(allGeneIDs,entrez_ids,by.x="Gene.ID",by.y="ensembl_gene_id") %>% distinct()

moduleColors = labels2colors(net.unsigned$colors)

# $ Choose interesting modules
intModules = c("brown", "green", "red")
for (module in intModules)
{
  # Select module probes
  modGenes = (moduleColors==module)
  # Get their entrez ID codes
  modGeneIDs = allGeneIDs[modGenes];
  # Write them into a file
  fileName = paste("~/Documents/git/unified_gene_expression/results/GeneIDs-", module, ".txt", sep="");
  write.table(as.data.frame(modGeneIDs), file = fileName,
              row.names = FALSE, col.names = FALSE)
}
# As background in the enrichment analysis, we will use all probes in the analysis.
fileName = paste("~/Documents/git/unified_gene_expression/results/GeneIDs-all.txt", sep="");
write.table(as.data.frame(allGeneIDs), file = fileName,
            row.names = FALSE, col.names = FALSE)

Issue to resolve: converting from ensembl gene IDs to entrez gene IDs.

#GOenr = GOenrichmentAnalysis(moduleColors, allGeneIDs, organism = "human", nBestP = 10)

Moving forward